{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "import re\n",
    "from IPython.display import display, Math, Latex, Image\n",
    "from PIL import Image\n",
    "import random\n",
    "from tqdm import tqdm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Generate vocabulary and char_to_idx, idx_to_char dictionaries"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "502\n"
     ]
    }
   ],
   "source": [
    "vocab = open(\"im2markup/latex_vocab.txt\").readlines()\n",
    "formulae = open(\"im2markup/formulas.norm.lst\",'r').readlines()\n",
    "char_to_idx = {x.split('\\n')[0]:i for i,x in enumerate(vocab)}\n",
    "# print len(char_to_idx)\n",
    "char_to_idx['#UNK'] = len(char_to_idx)\n",
    "char_to_idx['#START'] = len(char_to_idx)\n",
    "char_to_idx['#END'] = len(char_to_idx)\n",
    "idx_to_char = {y:x for x,y in char_to_idx.iteritems()}\n",
    "# print char_to_idx['#UNK']\n",
    "# print char_to_idx['#START']\n",
    "# print char_to_idx['#END']\n",
    "print len(char_to_idx)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Create train, test, validate files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\r",
      "  0%|          | 0/10355 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\rightarrowfill  not found!\n",
      "\\verb  not found!\n",
      "\\right/  not found!\n",
      "\\vector  not found!\n",
      "\\bigskip  not found!\n",
      "\\oval  not found!\n",
      "\\mathtt  not found!\n",
      "\\atopwithdelims  not found!\n",
      "\\textcircled  not found!\n",
      "\\scshape  not found!\n",
      "\\symbol  not found!\n",
      "\\uppercase  not found!\n",
      "\\expandafter  not found!\n",
      "\\romannumeral  not found!\n",
      "\\preceq  not found!\n",
      "\\triangleleft  not found!\n",
      "\\]  not found!\n",
      "\\[  not found!\n",
      "\\nulldelimiterspace  not found!\n",
      "\\everymath  not found!\n",
      "\\searrow  not found!\n",
      "\\tabcolsep  not found!\n",
      "\\coprod  not found!\n",
      "\\\"  not found!\n",
      "\\nearrow  not found!\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 10355/10355 [00:00<00:00, 24056.33it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Num files found in test set: 9695/10355\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "set = \"test\" # Use train, valid or test to generate corresponding files\n",
    "file_list = open(\"im2markup/\"+set+\".lst\",'r').readlines()\n",
    "set_list = []\n",
    "missing = {}\n",
    "for i,line in enumerate(file_list):\n",
    "    form = formulae[int(line.split()[1])].strip().split()\n",
    "    out_form = [char_to_idx['#START']]\n",
    "    for c in form:\n",
    "        try:\n",
    "            out_form += [char_to_idx[c]]\n",
    "        except:\n",
    "            if c not in missing:\n",
    "                print c, \" not found!\"\n",
    "                missing[c] = 1\n",
    "            else:\n",
    "                missing[c] += 1\n",
    "            out_form += [char_to_idx['#UNK']]\n",
    "    out_form += [char_to_idx['#END']]\n",
    "    set_list.append([line.split()[0],out_form])\n",
    "    \n",
    "buckets = {}\n",
    "import os\n",
    "file_not_found_count = 0\n",
    "for x,y in tqdm(set_list):\n",
    "    if os.path.exists('./images_processed/'+x): \n",
    "        img_shp = Image.open('./images_processed/'+x).size\n",
    "        try:\n",
    "            buckets[img_shp] += [(x,y)]\n",
    "        except:\n",
    "            buckets[img_shp] = [(x,y)]\n",
    "    else:\n",
    "        file_not_found_count += 1\n",
    "\n",
    "print \"Num files found in %s set: %d/%d\"%(set,len(set_list)-file_not_found_count,len(set_list)) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'\\\\right/': 3, '\\\\everymath': 1, '\\\\searrow': 2, '\\\\triangleleft': 3, '\\\\preceq': 1, '\\\\nearrow': 1, '\\\\vector': 3, '\\\\symbol': 1, '\\\\scshape': 1, '\\\\\"': 1, '\\\\oval': 2, '\\\\coprod': 1, '\\\\nulldelimiterspace': 1, '\\\\atopwithdelims': 1, '\\\\textcircled': 1, '\\\\tabcolsep': 2, '\\\\]': 1, '\\\\[': 1, '\\\\mathtt': 3, '\\\\verb': 1, '\\\\rightarrowfill': 1, '\\\\expandafter': 1, '\\\\uppercase': 1, '\\\\bigskip': 3, '\\\\romannumeral': 1}\n",
      "Max length of sequence:  627\n",
      "Testing!\n",
      "('20032b2645.png', [500, 39, 4, 488, 5, 498, 35, 498, 281, 495, 57, 469, 486, 497, 444, 462, 495, 490, 35, 488, 497, 461, 495, 497, 245, 495, 205, 495, 20, 497, 495, 490, 9, 488, 497, 497, 74, 57, 4, 488, 8, 490, 5, 161, 50, 4, 490, 5, 352, 7, 74, 495, 413, 495, 205, 495, 20, 497, 495, 21, 497, 497, 497, 74, 456, 4, 21, 488, 5, 74, 50, 4, 488, 5, 501])\n"
     ]
    }
   ],
   "source": [
    "print missing\n",
    "print \"Max length of sequence: \",max([len(x[1]) for x in set_list])\n",
    "## test\n",
    "print \"Testing!\"\n",
    "print buckets[random.choice(buckets.keys())][0]\n",
    "np.save(set+'_buckets',buckets)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "properties = {}\n",
    "properties['vocab_size'] = len(vocab)\n",
    "properties['vocab'] = vocab\n",
    "properties['char_to_idx'] = char_to_idx\n",
    "properties['idx_to_char'] = idx_to_char\n",
    "import numpy as np\n",
    "np.save('properties',properties)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I ( p ^ { 2 } , m _ { 2 } ^ { 2 } ) = N _ { d } ( - \\frac { 1 } { \\epsilon } + O ( \\epsilon ) ) ( 1 - \\frac { d } { 2 } ) \\int _ { 0 } ^ { 1 } d t ( \\frac { p ^ { 2 } } { m _ { 2 } ^ { 2 } } t ( 1 - t ) + 1 ) ^ { \\frac { d } { 2 } - 2 } ,\n",
      "\n",
      "\n",
      "\\alpha ( z ) = \\alpha _ { 0 } + \\tau \\alpha _ { 1 } + \\tau ^ { 2 } \\alpha _ { 2 } + \\ldots \\quad .\n",
      "\n",
      "\n",
      "{ \\cal L } = R - \\Lambda e ^ { a \\phi } - \\frac { 4 } { 3 } ( \\nabla \\phi ) ^ { 2 } - V e ^ { b \\phi } \\delta ( y )\n",
      "\n",
      "\n",
      "q _ { 6 } = \\frac 1 6 ( \\mathrm { t r } \\, M ) ^ { 3 } \\pm \\frac 1 2 \\mathrm { t r } \\, M \\, \\mathrm { t r } \\, M ^ { 2 } + \\frac 1 3 \\mathrm { t r } \\, M ^ { 3 } .\n",
      "\n",
      "\n",
      "\\Gamma _ { 0 } ^ { + } = - \\Gamma _ { 0 } , \\qquad \\Gamma _ { M } ^ { + } = \\Gamma _ { M } , \\qquad \\Gamma _ { M } ^ { T } = ( - 1 ) ^ { M + 1 } \\Gamma _ { M } .\n",
      "\n",
      "\n",
      "\\left( \\sum \\alpha _ { i } f _ { i } \\right) \\Rightarrow \\frac { 1 } { \\prod m _ { i } } \\sqrt { \\frac { \\Lambda ^ { ( N ) } } { D ^ { ( N ) } } } \\; \\gamma _ { N } = \\frac { 1 } { m _ { 0 } } \\; \\gamma _ { N } ,\n",
      "\n",
      "\n",
      "d i m ~ E _ { ( s ) } = 1 \\cdot \\phi \\cdot \\phi ^ { 2 } \\dot { . } . . \\phi ^ { s - 1 } = \\phi ^ { s ( s - 1 ) / 2 } ,\n",
      "\n",
      "\n",
      "\\begin{array} { c } { H _ { i j } = K _ { i j } = F _ { i j } ( i I ) , } \\\\ { D _ { i } ^ { A } \\Phi _ { A } = D _ { i } ^ { B } \\Phi _ { B } = E _ { i } ( i I ) . } \\\\ \\end{array}\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "f = open(\"im2markup/formulas.norm.lst\").readlines()\n",
    "np.random.seed(1234)\n",
    "import numpy as np\n",
    "for i in xrange(8):\n",
    "    print np.random.choice(f)\n",
    "    print"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}
